######################################################################################
####  Replication Code for Glynn and Ichino, "Using Qualitative Information 
####  to Improve Causal Inference" (2014, American Journal of Political Science)
####  Adam N. Glynn and Nahomi Ichino
####  July 2014
######################################################################################

# The analyses in the article can be replicated with the R code below.  Some portions 
# are made significantly easier by using the R package qualCI v0.1 (Kashin, Glynn, 
# and Ichino 2014), which includes the data from the Africa example in the article and
# functions to calculate the p-values and qualitative confidence intervals. 


library(optmatch)
require(xtable)

# call in the data
ro <- read.csv("Africa_qualCI.csv")

# note our plurality coding, explained in footnote 3 in the article
ro$t[ro$cname=="Nigeria"] <- 0
# create a variable for any military rule
ro$mil_ni <- ro$perc_mil >.01
ro$mil_ni_5 <- ro$perc_mil >.05
# our sample is SSA in the 1990s
ro <- subset(ro,year < 2000)



###############################################################
## Pair Matching
###############################################################

# exact match on three variables first
pl.em<-exactMatch(t~conflict+protest+mil_ni,data=ro)

# then Mahalanobis distance on the other variables
dist.pl<-match_on(t~loggdppc+fe_etfra,data=ro)

# put these together, with distance within the strata defined by the exact matching variables
m2<-fullmatch(dist.pl+pl.em,data=ro)
data.m2<-cbind(ro[!is.na(m2),c("cname","t", "fe_etfra", "loggdppc", "conflict", "protest", "perc_mil")],m2[!is.na(m2)])
data.m2<-data.m2[order(data.m2[,ncol(data.m2)],-data.m2$t),]
data.m2<-data.m2[,c(dim(data.m2)[2],seq(1,dim(data.m2)[2]-1,1))]
colnames(data.m2)<-c("Pair", "Country", "Treated", "Ethnic Fractionalization", "Log GDP per capita", "Conflict", "Protest", "Military")
# assign s (index of set) to correspond to the article
data.m2$Pair<-c(4,4,2,2,3,3,1,1)
data.m2<-data.m2[order(data.m2$Pair,-data.m2$Treated),]
data.m2
xtable(data.m2,digits=3) # produces table 1 in the SI Section B

##############################################
## SI Section B: Check balance for log GDP per capita
## signed rank statistic for these four pairs

x<-data.frame(data.m2[c(1,3,5,7),c(1,5)], data.m2[c(2,4,6,8),5])
colnames(x)<-c("s","t_gdp", "c_gdp")
x$s1<-sign(x$t_gdp-x$c_gdp)>0
x$s2<-sign(x$c_gdp-x$t_gdp)>0
# between-set ranks
x$q_s<-rank(abs(x$t_gdp-x$c_gdp))
x


m<-matrix(nrow=16,ncol=6)
# each row is a different permutation of treatment assignment

# W1
m[,1]<-c(rep(x$s1[1]*x$q_s[1],8),rep((x$s2[1])*x$q_s[1],8))
# W2
m[,2]<-c(rep(c(rep(x$s1[2]*x$q_s[2],4),rep((x$s2[2])*x$q_s[2],4)),2))
# W3
m[,3]<-c(rep(c(rep(x$s1[3]*x$q_s[3],2),rep((x$s2[3])*x$q_s[3],2)),4))
# W4
m[,4]<-c(rep(c(x$s1[4]*x$q_s[4],(x$s2[4])*x$q_s[4]),8))
# W
m[,5]<-m[,1]+m[,2]+m[,3]+m[,4]
# =1 if observed W >= W for that row
m[,6]<-(m[,5]<=m[1,5])
m

## observed data is the first row, W=1; second row has W=0; p-value: 2/16
sum(m[,6])
sum(m[,6])/dim(m)[1]


##############################################
## SI Section B: Check balance for ethnic fractionalization
## signed rank statistic for these four pairs

x<-data.frame(data.m2[c(1,3,5,7),c(1,4)], data.m2[c(2,4,6,8),4])
colnames(x)<-c("s","t_etfra", "c_etfra")
x$s1<-sign(x$t_etfra-x$c_etfra)>0
x$s2<-sign(x$c_etfra-x$t_etfra)>0
x$q_s<-rank(abs(x$t_etfra-x$c_etfra))
x

m<-matrix(nrow=16,ncol=6)
# each row is a different permutation of treatment assignment

# W1
m[,1]<-c(rep(x$s1[1]*x$q_s[1],8),rep((x$s2[1])*x$q_s[1],8))
# W2
m[,2]<-c(rep(c(rep(x$s1[2]*x$q_s[2],4),rep((x$s2[2])*x$q_s[2],4)),2))
# W3
m[,3]<-c(rep(c(rep(x$s1[3]*x$q_s[3],2),rep((x$s2[3])*x$q_s[3],2)),4))
# W4
m[,4]<-c(rep(c(x$s1[4]*x$q_s[4],(x$s2[4])*x$q_s[4]),8))
# sum up the W
m[,5]<-m[,1]+m[,2]+m[,3]+m[,4]
## observed data is the first row, by construction
m[,6]<-(m[,5]>=m[1,5])
m

## observed data is the first row, W=10; p-value: 1/16
sum(m[,6])
sum(m[,6])/dim(m)[1]



###############################################################
## Analysis with the Paired Design
###############################################################
# The first p-value in the article is produced using an exact version of McNemar's test on the data in Table 1. 

library(exact2x2) # requires packages exactci and ssanv
exact2x2(matrix(c(1,0,2,1),nr=2), alternative="greater", paired=TRUE)

# the first 1 in the matrix corresponds to the zero/zero pair 
#     (Tanzania/Guinea-Bissau)
# the 0 in the matrix corresponds to the lack of a zero/one pair 
# the 2 in the matrix corresponds to the two one/zero pairs 
#     (Cameroon/Gabon, Malawi/Zambia)
# the second 1 in the matrix corresponds to the one/one pair (Kenya/Cote d'Ivoire)

# We produce the second p-value with the signed-rank test on the signs and ranks in Table 2.  We can use the standard Wilcoxon function on hypothetical data that correspond to the signs and ranks of table. 

wilcox.test(c(6,8,4,2),c(3,4,2,1), alternative="greater", paired=TRUE, exact=TRUE)
# Cameroon = 6
# Kenya = 8
# Malawi = 4
# Tanzania = 2
# Gabon = 3
# Cote d'Ivoire = 4
# Zambia = 2
# Guinea-Bissau = 1

# Alternatively, under the assumption that we flip a fair coin for each pair, each row in the permutation distribution in Table 3 has a 1/2*1/2*1/2*1/2 =1/16 chance of occurring under the sharp null hypothesis. Because the observed signed-rank statistic took its maximum value for this example (W=10), the p-value is just the probability for the first row (i.e., 1/16). 

# We can also use the package qualCI to calculate the p-value and qualitative confidence interval: 

library(qualCI)
data(pluralityPairs)
between.ranks <- c(3,4,2,1)
dat <- quade.prep(data=pluralityPairs, set="pair", unit="country", treatment=
"plurality", withinRank="OppHarRank", betweenRank=between.ranks)
dat
qout <- quade(dat)
qout # p-value
qualCI(qout) # qualitative confidence interval for table 4 in the article



###############################################################
## Full Matching (with Madagascar)
###############################################################

pl.em3<-exactMatch(t~fr+conflict+protest+mil_ni_5,data=ro)
dist.pl3<-match_on(t~loggdppc+fe_etfra,data=ro)
m4<-fullmatch(dist.pl3+pl.em3,data=ro)

data.m4<-cbind(ro[!is.na(m4),c("cname","t","fe_etfra","loggdppc", "fr","conflict", "protest", "perc_mil", "priv1_mean")],m4[!is.na(m4)])
data.m4 <-data.m4[order(data.m4[,dim(data.m4)[2]],-data.m4 $t),]
data.m4 <-data.m4[,c(dim(data.m4)[2],seq(1,dim(data.m4)[2]-1,1))]
colnames(data.m4)<-c("Set", "Country", "Treated", "Ethnic Fractionalization", "Log GDP per capita", "French", "Conflict", "Protest", "Military", "Private Credit")
data.m4$Set<-c(3,3,2,2,2,1,1,1,1)
data.m4 <-data.m4[order(data.m4 $Set,-data.m4 $Treated),]
data.m4
xtable(data.m4,digits=3) #produces table 5 in the SI Section E

##############################################
## SI Section E: Check balance for log GDP per capita
## Quade statistic for these three sets

# col 5=log GDP pc

d<-cbind(data.m4[,c(1,5)],rep(NA,dim(data.m4)[1]))
# within-set rank for units in set 1
d[1,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[1]
d[2,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[2]
d[3,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[3]
d[4,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[4]
# within-set rank for units in set 2
d[5,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[1]
d[6,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[2]
d[7,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[3]
# within-set rank for units in set 3
d[8,3]<-rank(c(d[8,2], d[9,2]))[1]
d[9,3]<-rank(c(d[8,2], d[9,2]))[2]

colnames(d)<-c("set","var","rank")

# between-set ranks
q1<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), (d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), (d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[1]
q2<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), (d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), (d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[2]
q3<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), (d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), (d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[3]
colnames(d)<-c("set","fe_etfra","rank")

m<-matrix(NA,ncol=5,nrow=24)

# each row is a different permutation of treatment assignment

m[,1]<-c(rep(q1*d[1,3],6), rep(q1*d[2,3],6), rep(q1*d[3,3],6), rep(q1*d[4,3],6))
m[,2]<-c(rep(c(rep(q2*(d[5,3]+d[6,3]),2), rep(q2*(d[5,3]+d[7,3]),2), rep(q2*(d[6,3]+d[7,3]),2)),4))
m[,3]<-c(rep(c(q3*d[8,3],q3*d[9,3]),12))
m[,4]<-m[,1]+m[,2]+m[,3]
## observed data is the first row, by construction
m[,5]<-(m[,4]<=m[1,4])
## p-value:
sum(m[,5])
sum(m[,5])/dim(m)[1]
## p-value for log GDP per capita = 11/24 (with Madagascar)





##############################################
## SI Section E: Check balance for ethnic fractionalization
## Quade statistic for these three sets

# col 4=ethfrac

d<-cbind(data.m4[,c(1,4)],rep(NA,dim(data.m4)[1]))
# within-set rank for units in set 1
d[1,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[1]
d[2,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[2]
d[3,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[3]
d[4,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[4]
# within-set rank for units in set 2
d[5,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[1]
d[6,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[2]
d[7,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[3]
# within-set rank for units in set 3
d[8,3]<-rank(c(d[8,2], d[9,2]))[1]
d[9,3]<-rank(c(d[8,2], d[9,2]))[2]

colnames(d)<-c("set","var","rank")

# between-set rank for set 1
q1<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
	(d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
	(d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[1]
# between-set rank for set 2
q2<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
	(d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
	(d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[2]
# between-set rank for set 3
q3<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
	(d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
	(d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[3]



m<-matrix(NA,ncol=5,nrow=24)
# each row is a different permutation of treatment assignment

m[,1]<-c(rep(q1*d[1,3],6), rep(q1*d[2,3],6), rep(q1*d[3,3],6), rep(q1*d[4,3],6))
m[,2]<-c(rep(c(rep(q2*(d[5,3]+d[6,3]),2), rep(q2*(d[5,3]+d[7,3]),2), rep(q2*(d[6,3]+d[7,3]),2)),4))
m[,3]<-c(rep(c(q3*d[8,3],q3*d[9,3]),12))
m[,4]<-m[,1]+m[,2]+m[,3]
## observed data is the first row, by construction
m[,5]<-(m[,4]>=m[1,4])
## p-value:
sum(m[,5])
sum(m[,5])/dim(m)[1]
## p-value for ethnic fractionalization = 1/24 (with Madagascar)



##############################################
## SI Section E: Check balance for private capital
## Quade statistic for these three sets

d<-cbind(data.m4[,c(1,10)],rep(NA,dim(data.m4)[1]))
# within-set rank for units in set 1
d[1,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[1]
d[2,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[2]
d[3,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[3]
d[4,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[4]
# within-set rank for units in set 2
d[5,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[1]
d[6,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[2]
d[7,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[3]
# within-set rank for units in set 3
d[8,3]<-rank(c(d[8,2], d[9,2]))[1]
d[9,3]<-rank(c(d[8,2], d[9,2]))[2]

colnames(d)<-c("set","var","rank")

# between-set rank for set 1
q1<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
	(d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
	(d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[1]
# between-set rank for set 2
q2<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
	(d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
	(d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[2]
# between-set rank for set 3
q3<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
	(d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
	(d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[3]
m<-matrix(NA,ncol=5,nrow=24)

# each row is a different permutation of treatment assignment

m[,1]<-c(rep(q1*d[1,3],6), rep(q1*d[2,3],6), rep(q1*d[3,3],6), rep(q1*d[4,3],6))
m[,2]<-c(rep(c(rep(q2*(d[5,3]+d[6,3]),2), rep(q2*(d[5,3]+d[7,3]),2), 
	rep(q2*(d[6,3]+d[7,3]),2)),4))
m[,3]<-c(rep(c(q3*d[8,3],q3*d[9,3]),12))
m[,4]<-m[,1]+m[,2]+m[,3]
## observed data is the first row, by construction
m[,5]<-(m[,4]<=m[1,4])
## p-value:
sum(m[,5])
sum(m[,5])/dim(m)[1]
## p-value for log GDP per capita = 21/24 =0.875 (with Madagascar)




# We use the Quade statistic to produce the p-value associated with using full matching (Table 6 in article).  This can be calculated by creating hypothetical data that correspond to the between-set and within-set ranks of Table 5. 

# Cameroon = 8
# Gabon = 6
# Cote d'Ivoire = 4
# Madagascar = 2
# Kenya = 10
# Malawi = 6
# Zambia = 3
# Tanzania = 2
# Guinea-Bissau = 1

d<- data.frame(c(1,1,1,1,2,2,2,3,3),c(8,6,4,2,10,6,3,2,1),rep(NA,9))
colnames(d)<-c("set","var","rank")

# within-set rank for units in set 1
d[1,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[1]
d[2,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[2]
d[3,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[3]
d[4,3]<-rank(c(d[1,2], d[2,2], d[3,2], d[4,2]))[4]
# within-set rank for units in set 2
d[5,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[1]
d[6,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[2]
d[7,3]<-rank(c(d[5,2], d[6,2], d[7,2]))[3]
# within-set rank for units in set 3
d[8,3]<-rank(c(d[8,2], d[9,2]))[1]
d[9,3]<-rank(c(d[8,2], d[9,2]))[2]

# between-set rank for set 1
q1<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
    (d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
    (d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[1]
# between-set rank for set 2
q2<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
    (d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
    (d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[2]
# between-set rank for set 3
q3<-rank(c((d$var[d$rank==4&d$set==1]-d$var[d$rank==1&d$set==1]), 
    (d$var[d$rank==3&d$set==2]-d$var[d$rank==1&d$set==2]), 
    (d$var[d$rank==2&d$set==3]-d$var[d$rank==1&d$set==3])))[3]

# each row is a different permutation of treatment assignment 

m<-matrix(NA,ncol=5,nrow=24)
# Q1
m[,1]<-c(rep(q1*d[1,3],6), rep(q1*d[2,3],6), rep(q1*d[3,3],6), rep(q1*d[4,3],6))
# Q2
m[,2]<-c(rep(c(rep(q2*(d[5,3]+d[6,3]),2), rep(q2*(d[5,3]+d[7,3]),2), 
    rep(q2*(d[6,3]+d[7,3]),2)),4))
# Q3
m[,3]<-c(rep(c(q3*d[8,3],q3*d[9,3]),12))
# Q
m[,4]<-m[,1]+m[,2]+m[,3] 
# note that by construction, observed data is the first row
m[,5]<-(m[,4]>=m[1,4])
mean(m[,5]) # p-value 1/24



#Again, the observed statistic is the largest possible value of the Quade statistic. Therefore, if the treatment assignment is ``as if'' random for our example, then pi1.1000=1/(n1 choose m1) =1/4 for s=1, pi2.110=1/(n2 choose m2)=1/3 for s=2, and pi3.10 = 1/(n3 choose m3)=1/2 for s=3, and the p-value would be pi1.1000 * pi2.110 * pi3.10 = 1/4 * 1/3 * 1/2 = 1/24.

pi1.1000 <- 1/4
pi2.110 <- 1/3
pi3.10 <- 1/2
pi1.1000*pi2.110*pi3.10 # produces the p-value of 1/24

#We conduct the sensitivity analysis by increasing the values of pi1.1000, pi2.110, or pi3.10. For example, the p-values in Table 7 of the paper can be reproduced with the following:

pi1.1000 <- c(1/4,1.25/4.25,1.5/4.5,2/5,2.5/5.5,3/6)
pi2.110 <- 1/3
pi3.10 <- c(1/2,1.25/2.25,1.5/2.5,2/3,2.5/3.5,3/4)
pi2.110*outer(pi1.1000,pi3.10) # produces the numbers before rounding 

#We can use the qualCI package to produce the p-values for the full matched design with the Quade statistic:  

data(pluralitySets)
between.ranks <- c(2,3,1)
dat <- quade.prep(data=pluralitySets, set="set", treatment="plurality", 
unit="country", withinRank="OppHarRank", betweenRank=between.ranks)
qout <- quade(dat)
qout


#To produce the qualitative confidence interval, 
qualCI(qout)

#This will offer the user several pairs of treated and control countries and ask for which pair the between-set rank was most difficult to determine. The Tanzania / Guinea-Bissau pair was the most difficult to rank.

#The following code will produce the sensitivity analysis in Table 7 using the qualCI package.
pi1 <- list(c(1/4,1/4,1/4,1/4),
c(1.25/4.25,rep((1-1.25/4.25)/3,3)),
c(1.5/4.5,rep((1-1.5/4.5)/3,3)), 
c(2/5,rep(1/5,3)), 
c(2.5/5.5,rep((1-2.5/5.5)/3,3)),
c(3/6,rep(1/6,3)))
pi3 <- list(c(1/2,1/2),
c(1.25/2.25,1-1.25/2.25), 
c(1.5/2.5, 1-1.5/2.5), 
c(2/3, 1/3), 
c(2.5/3.5, 1-2.5/3.5), 
c(3/4, 1/4))
table7 <- matrix(data=NA, nrow=length(pi1), ncol=length(pi3))


for(m in 1:nrow(table7)){
	for(n in 1:ncol(table7)){
		dat[[1]]$prob <- pi1[[m]]
		dat[[3]]$prob <- pi3[[n]]
		table7[m,n] <- quade(dat)$pval
	}
}
table7


